import scrapy
import json
import re
import time
from scrapy.selector import Selector
from novel.items import CWMTypeItem
from novel.items import CWMNovelItem
from novel.items import CWMRollItem
from novel.items import CWMChapterItem
from novel.items import CWMContentItem
from novel.tool_get_content import CWMCrawlContent


class NovelCiweimaoSpider(scrapy.Spider):
    """Crawl novels from ciweimao.com.

    Callback pipeline:
      parse         -- landing page: follow each book in the two-column list.
      parse_novel   -- book page: yield category + novel metadata, then POST
                       for the full chapter list.
      parse_chapter -- chapter-list response: yield roll/chapter items for
                       free (non-VIP) content and follow each free chapter.
      parse_content -- chapter page: fetch decrypted text via
                       CWMCrawlContent and yield it.
    """

    name = 'novel_ciweimao'
    allowed_domains = ['ciweimao.com']
    start_urls = ['http://ciweimao.com/']
    # POST endpoint that returns the complete chapter list of a book.
    url = 'https://ciweimao.com/chapter/get_chapter_list_in_chapter_detail'

    # Compiled once at class creation instead of on every response.
    _BOOK_ID_RE = re.compile(r'book/(.*)')
    _CHAP_ID_RE = re.compile(r'chapter/(.*)')

    def parse(self, response):
        """Follow every book link on the landing page's book list."""
        books = response.xpath('//*[@class="book-list book-list-two"]//li')
        for book in books:
            href = book.xpath('./a/@href').get()
            if href:  # skip malformed <li> entries without a link
                # urljoin tolerates relative hrefs, unlike the raw value.
                yield scrapy.Request(url=response.urljoin(href),
                                     callback=self.parse_novel)

    def parse_novel(self, response):
        """Yield category and novel metadata, then request the chapter list."""
        type_item = CWMTypeItem()
        type_item['type_title'] = response.xpath(
            '//*[@class="breadcrumb"]/a[last()]/text()').get()
        yield type_item

        novel_item = CWMNovelItem()
        novel_item['novel_title'] = response.xpath(
            '//*[@class="book-info"]/h3/text()').get()
        novel_item['novel_author'] = response.xpath(
            '//*[@class="book-info"]//span/a/text()').get()
        # Strip whitespace, drop empty tags, join as a comma-separated string.
        tags = (tag.strip() for tag in response.css('.label-box span::text').getall())
        novel_item['novel_tag'] = ','.join(tag for tag in tags if tag)
        novel_item['cover'] = response.xpath('//img[1]/@src').get()
        novel_item['novel_state'] = response.xpath(
            '//p[@class="update-state"]/text()').get()
        novel_item['novel_intro'] = response.xpath(
            '//div[@class="book-desc J_mCustomScrollbar"]/text()').get()
        novel_item['type_title'] = type_item['type_title']
        yield novel_item

        # Extract the book id from the URL; bail out (with a log line) if the
        # URL shape changed instead of raising a bare IndexError.
        match = self._BOOK_ID_RE.search(response.url)
        if match is None:
            self.logger.warning('no book_id found in url %s', response.url)
            return
        book_id = match.group(1)
        yield scrapy.FormRequest(
            url=self.url,
            formdata={'book_id': book_id,
                      'chapter_id': '0',
                      'orderby': '0'},
            callback=self.parse_chapter,
            meta={'novel_title': novel_item['novel_title'], 'book_id': book_id})

    def parse_chapter(self, response):
        """Yield roll/chapter items for free rolls and follow free chapters."""
        novel_title = response.meta['novel_title']

        for roll in response.xpath('//div[@class="book-chapter-box"]'):
            # If the first chapter of the roll is VIP-locked, the whole roll
            # is paid content: skip the roll entirely.
            if roll.xpath('.//ul[1]/li[1]//a/i[@class="icon-lock"]'):
                continue

            roll_item = CWMRollItem()
            roll_item['roll_title'] = roll.xpath('./h4/text()').get()
            roll_item['novel_title'] = novel_title
            yield roll_item

            for chapter in roll.xpath('.//li/a'):
                # Paid chapters carry a lock icon; skip them individually.
                if chapter.xpath('./i[@class="icon-lock"]'):
                    continue

                chap_item = CWMChapterItem()
                chap_item['chap_title'] = chapter.xpath('./text()').get()
                chap_item['roll_title'] = roll_item['roll_title']
                # Crawl timestamp; strftime defaults to "now", so the explicit
                # time.time()/localtime round-trip is unnecessary.
                chap_item['ctime'] = time.strftime("%Y-%m-%d %H:%M:%S")
                yield chap_item

                href = chapter.xpath('./@href').get()
                if href:
                    yield scrapy.Request(
                        url=response.urljoin(href),
                        callback=self.parse_content,
                        meta={'chap_title': chap_item['chap_title']})

    def parse_content(self, response):
        """Fetch the decrypted chapter text and yield it as a content item."""
        chap_title = response.meta['chap_title']

        # Extract the chapter id from the URL; skip gracefully on mismatch.
        match = self._CHAP_ID_RE.search(response.url)
        if match is None:
            self.logger.warning('no chap_id found in url %s', response.url)
            return
        chap_id = match.group(1)

        try:
            result = CWMCrawlContent.getContent(chap_id)
            paragraphs = Selector(text=result).xpath('//p/text()').getall()
            content_item = CWMContentItem()
            content_item['content_text'] = ''.join(paragraphs)
            content_item['chap_title'] = chap_title
            yield content_item
        except Exception:
            # Best-effort per chapter: log with full traceback (print(e)
            # discarded it) and keep the crawl running.
            self.logger.exception('failed to fetch content for chapter %s',
                                  chap_id)
